Lecture 13
| SciPy Method | Description | Gradient | Hessian |
|---|---|---|---|
| — | Gradient Descent (naive) | ✓ | ✗ |
| — | Newton’s method (naive) | ✓ | ✓ |
| — | Conjugate Gradient (naive) | ✓ | ✓ |
| "CG" | Nonlinear Conjugate Gradient (Polak and Ribiere variation) | ✓ | ✗ |
| "Newton-CG" | Truncated Newton method (Newton w/ CG step direction) | ✓ | Optional |
| "BFGS" | Broyden, Fletcher, Goldfarb, and Shanno (Quasi-newton method) | Optional | ✗ |
| "L-BFGS-B" | Limited-memory BFGS (Quasi-newton method) | Optional | ✗ |
| "Nelder-Mead" | Nelder-Mead simplex reflection method | ✗ | ✗ |
def define_methods(x0, f, grad, hess, tol=1e-8):
    """Build a dict of zero-argument callables, one per optimizer.

    Each value, when called, runs the corresponding method on objective
    `f` from starting point `x0`. Deferring execution behind lambdas
    lets the same dict be used both for timing and for result collection.

    Parameters
    ----------
    x0 : starting point passed to every optimizer.
    f, grad, hess : objective, gradient, and Hessian callables.
    tol : float, convergence tolerance forwarded to every method.
    """
    return {
        "naive_newton":   lambda: newtons_method(x0, f, grad, hess, tol=tol),
        "naive_cg":       lambda: conjugate_gradient(x0, f, grad, hess, tol=tol),
        "CG":             lambda: optimize.minimize(f, x0, jac=grad, method="CG", tol=tol),
        # hess=None: Newton-CG approximates Hessian-vector products from grad
        "newton-cg":      lambda: optimize.minimize(f, x0, jac=grad, hess=None, method="Newton-CG", tol=tol),
        "newton-cg w/ H": lambda: optimize.minimize(f, x0, jac=grad, hess=hess, method="Newton-CG", tol=tol),
        "bfgs":           lambda: optimize.minimize(f, x0, jac=grad, method="BFGS", tol=tol),
        # no jac: BFGS falls back to finite-difference gradients
        "bfgs w/o G":     lambda: optimize.minimize(f, x0, method="BFGS", tol=tol),
        "l-bfgs-b":       lambda: optimize.minimize(f, x0, method="L-BFGS-B", tol=tol),
        "nelder-mead":    lambda: optimize.minimize(f, x0, method="Nelder-Mead", tol=tol)
    }
# Output columns: naive_newton naive_cg CG ... bfgs w/o G l-bfgs-b nelder-mead
0 0.021096 0.035406 0.011022 ... 0.063204 0.032869 0.135478
1 0.020689 0.035320 0.010575 ... 0.063190 0.032366 0.134706
2 0.020797 0.035483 0.010438 ... 0.062650 0.033036 0.134936
3 0.020783 0.035671 0.010599 ... 0.062381 0.032441 0.134595
4 0.020857 0.035638 0.010403 ... 0.063007 0.032527 0.135014
5 0.020765 0.035697 0.010402 ... 0.064476 0.032307 0.134793
6 0.020727 0.035670 0.010464 ... 0.062205 0.033232 0.134989
7 0.020650 0.035522 0.010461 ... 0.063753 0.032421 0.135109
8 0.020701 0.035572 0.010389 ... 0.063448 0.032707 0.135807
9 0.020657 0.035525 0.010470 ... 0.062930 0.033151 0.135847
[10 rows x 9 columns]
def time_cost_func(n, x0, name, cost_func, *args):
    """Time every optimizer from define_methods() on one cost function.

    Parameters
    ----------
    n : int
        timeit count — n repeats of n executions each.
    x0 : tuple
        Starting point forwarded to every optimizer.
    name : str
        Label stored in the resulting "cost_func" column.
    cost_func : callable
        Factory returning the (f, grad, hess) triple.
    *args
        Extra arguments forwarded to cost_func.

    Returns
    -------
    pandas.DataFrame in long format: columns variable, value, cost_func.
    """
    # BUG FIX: the original re-assigned x0 = (1.6, 1.1) here, silently
    # discarding the caller-supplied starting point.
    f, grad, hess = cost_func(*args)
    methods = define_methods(x0, f, grad, hess)
    return (
        pd.DataFrame({
            key: timeit.Timer(methods[key]).repeat(n, n)
            for key in methods
        })
        .melt()
        .assign(cost_func=name)
    )
# Benchmark every optimizer on three test problems (10 timing repeats each).
df = pd.concat(
    time_cost_func(10, x0, label, factory, *extra)
    for label, factory, extra in (
        ("Well-cond quad", mk_quad, (0.7,)),
        ("Ill-cond quad", mk_quad, (0.02,)),
        ("Rosenbrock", mk_rosenbrock, ()),
    )
)
df
variable value cost_func
0 naive_newton 0.002192 Well-cond quad
1 naive_newton 0.002107 Well-cond quad
2 naive_newton 0.002074 Well-cond quad
3 naive_newton 0.002083 Well-cond quad
4 naive_newton 0.002064 Well-cond quad
.. ... ... ...
85 nelder-mead 0.022829 Rosenbrock
86 nelder-mead 0.022748 Rosenbrock
87 nelder-mead 0.022783 Rosenbrock
88 nelder-mead 0.022777 Rosenbrock
89 nelder-mead 0.022710 Rosenbrock
[270 rows x 3 columns]
import cProfile

# Deterministic (tracing) profile of BFGS on the well-conditioned quadratic.
f, grad, hess = mk_quad(0.7)

def run():
    # Tight tolerance so the optimizer does a non-trivial amount of work.
    optimize.minimize(fun=f, x0=(1.6, 1.1), jac=grad, method="BFGS", tol=1e-11)

cProfile.run('run()', sort="tottime")
# 1293 function calls in 0.001 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
9 0.000 0.000 0.001 0.000 _linesearch.py:31(line_search_wolfe1)
1 0.000 0.000 0.001 0.001 _optimize.py:1318(_minimize_bfgs)
58 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}
152 0.000 0.000 0.000 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
9 0.000 0.000 0.000 0.000 _linesearch.py:91(scalar_search_wolfe1)
10 0.000 0.000 0.000 0.000 <string>:2(f)
37 0.000 0.000 0.000 0.000 fromnumeric.py:69(_wrapreduction)
21 0.000 0.000 0.000 0.000 shape_base.py:23(atleast_1d)
26 0.000 0.000 0.000 0.000 _optimize.py:235(vecnorm)
10 0.000 0.000 0.000 0.000 <string>:8(gradient)
20 0.000 0.000 0.000 0.000 numeric.py:2407(array_equal)
1 0.000 0.000 0.001 0.001 _minimize.py:45(minimize)
26 0.000 0.000 0.000 0.000 fromnumeric.py:2188(sum)
1 0.000 0.000 0.000 0.000 _differentiable_functions.py:86(__init__)
51 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(dot)
84 0.000 0.000 0.000 0.000 {built-in method numpy.asarray}
9 0.000 0.000 0.000 0.000 _linesearch.py:77(derphi)
9 0.000 0.000 0.000 0.000 _linesearch.py:73(phi)
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:132(fun_wrapped)
9 0.000 0.000 0.001 0.000 _optimize.py:1144(_line_search_wolfe12)
37 0.000 0.000 0.000 0.000 fromnumeric.py:70(<dictcomp>)
26 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(sum)
21 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(atleast_1d)
20 0.000 0.000 0.000 0.000 {built-in method numpy.arange}
1 0.000 0.000 0.001 0.001 {built-in method builtins.exec}
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:162(grad_wrapped)
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:264(fun)
21 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(copy)
21 0.000 0.000 0.000 0.000 {built-in method numpy.array}
1 0.000 0.000 0.000 0.000 twodim_base.py:162(eye)
20 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(array_equal)
1 0.000 0.000 0.000 0.000 linalg.py:2342(norm)
21 0.000 0.000 0.000 0.000 function_base.py:871(copy)
9 0.000 0.000 0.000 0.000 _differentiable_functions.py:240(update_x)
10 0.000 0.000 0.000 0.000 fromnumeric.py:2703(amax)
20 0.000 0.000 0.000 0.000 {method 'all' of 'numpy.ndarray' objects}
19 0.000 0.000 0.000 0.000 {built-in method numpy.zeros}
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:270(grad)
51 0.000 0.000 0.000 0.000 multiarray.py:740(dot)
20 0.000 0.000 0.000 0.000 _methods.py:61(_all)
20 0.000 0.000 0.000 0.000 {method 'copy' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 _optimize.py:244(_prepare_scalar_function)
10 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(amax)
41 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}
11 0.000 0.000 0.000 0.000 _differentiable_functions.py:249(_update_fun)
10 0.000 0.000 0.000 0.000 {method 'astype' of 'numpy.ndarray' objects}
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:154(update_fun)
1 0.000 0.000 0.001 0.001 <string>:1(run)
10 0.000 0.000 0.000 0.000 _differentiable_functions.py:166(update_grad)
1 0.000 0.000 0.000 0.000 _minimize.py:973(standardize_constraints)
11 0.000 0.000 0.000 0.000 _differentiable_functions.py:254(_update_grad)
1 0.000 0.000 0.000 0.000 {method 'dot' of 'numpy.ndarray' objects}
37 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}
10 0.000 0.000 0.000 0.000 numeric.py:1878(isscalar)
1 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(atleast_2d)
26 0.000 0.000 0.000 0.000 fromnumeric.py:2183(_sum_dispatcher)
1 0.000 0.000 0.000 0.000 shape_base.py:81(atleast_2d)
21 0.000 0.000 0.000 0.000 function_base.py:867(_copy_dispatcher)
24 0.000 0.000 0.000 0.000 {built-in method builtins.len}
20 0.000 0.000 0.000 0.000 numeric.py:2403(_array_equal_dispatcher)
1 0.000 0.000 0.000 0.000 fromnumeric.py:2333(any)
9 0.000 0.000 0.000 0.000 {built-in method builtins.min}
22 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
21 0.000 0.000 0.000 0.000 shape_base.py:19(_atleast_1d_dispatcher)
22 0.000 0.000 0.000 0.000 {built-in method numpy.asanyarray}
1 0.000 0.000 0.000 0.000 {method 'flatten' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(any)
1 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}
10 0.000 0.000 0.000 0.000 fromnumeric.py:2698(_amax_dispatcher)
9 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}
1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}
1 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 <__array_function__ internals>:177(norm)
1 0.000 0.000 0.000 0.000 _base.py:1301(isspmatrix)
1 0.000 0.000 0.001 0.001 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}
7 0.000 0.000 0.000 0.000 {built-in method builtins.callable}
1 0.000 0.000 0.000 0.000 {method 'lower' of 'str' objects}
1 0.000 0.000 0.000 0.000 _optimize.py:216(_check_unknown_options)
1 0.000 0.000 0.000 0.000 _methods.py:55(_any)
2 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}
1 0.000 0.000 0.000 0.000 linalg.py:117(isComplexType)
2 0.000 0.000 0.000 0.000 {built-in method _operator.index}
1 0.000 0.000 0.000 0.000 {method 'setdefault' of 'dict' objects}
1 0.000 0.000 0.000 0.000 fromnumeric.py:2328(_any_dispatcher)
1 0.000 0.000 0.000 0.000 linalg.py:2338(_norm_dispatcher)
1 0.000 0.000 0.000 0.000 _optimize.py:324(hess)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1 0.000 0.000 0.000 0.000 shape_base.py:77(_atleast_2d_dispatcher)
from pyinstrument import Profiler

# Statistical (sampling) profile of the same BFGS run for comparison
# with the cProfile trace above.
f, grad, hess = mk_quad(0.7)

profiler = Profiler(interval=0.00001)  # sample every 10 microseconds
profiler.start()
opt = optimize.minimize(fun=f, x0=(1.6, 1.1), jac=grad, method="BFGS", tol=1e-11)
p = profiler.stop()

profiler.print(show_all=True)
_ ._ __/__ _ _ _ _ _/_ Recorded: 10:56:16 Samples: 253
/_//_/// /_\ / //_// / //_'/ // Duration: 0.004 CPU time: 0.004
/ _/ v4.4.0
Program: /opt/homebrew/Cellar/python@3.10/3.10.10_1/libexec/bin/python3
0.004 MainThread <thread>:8553529664
|- 0.002 [self] None
`- 0.002 <module> <string>:1
`- 0.002 minimize scipy/optimize/_minimize.py:45
`- 0.001 _minimize_bfgs scipy/optimize/_optimize.py:1318
|- 0.001 _line_search_wolfe12 scipy/optimize/_optimize.py:1144
| `- 0.001 line_search_wolfe1 scipy/optimize/_linesearch.py:31
| |- 0.001 scalar_search_wolfe1 scipy/optimize/_linesearch.py:91
| | |- 0.000 phi scipy/optimize/_linesearch.py:73
| | | `- 0.000 ScalarFunction.fun scipy/optimize/_differentiable_functions.py:264
| | | |- 0.000 ScalarFunction._update_fun scipy/optimize/_differentiable_functions.py:249
| | | | `- 0.000 update_fun scipy/optimize/_differentiable_functions.py:154
| | | | `- 0.000 fun_wrapped scipy/optimize/_differentiable_functions.py:132
| | | | |- 0.000 f <string>:2
| | | | | `- 0.000 sum <__array_function__ internals>:177
| | | | | `- 0.000 sum numpy/core/fromnumeric.py:2188
| | | | | |- 0.000 [self] None
| | | | | `- 0.000 _wrapreduction numpy/core/fromnumeric.py:69
| | | | `- 0.000 [self] None
| | | |- 0.000 array_equal <__array_function__ internals>:177
| | | | `- 0.000 array_equal numpy/core/numeric.py:2407
| | | | `- 0.000 _all numpy/core/_methods.py:61
| | | `- 0.000 update_x scipy/optimize/_differentiable_functions.py:240
| | |- 0.000 derphi scipy/optimize/_linesearch.py:77
| | | |- 0.000 ScalarFunction.grad scipy/optimize/_differentiable_functions.py:270
| | | | |- 0.000 ScalarFunction._update_grad scipy/optimize/_differentiable_functions.py:254
| | | | | `- 0.000 update_grad scipy/optimize/_differentiable_functions.py:166
| | | | | `- 0.000 grad_wrapped scipy/optimize/_differentiable_functions.py:162
| | | | | `- 0.000 gradient <string>:8
| | | | | `- 0.000 [self] None
| | | | `- 0.000 array_equal <__array_function__ internals>:177
| | | `- 0.000 [self] None
| | `- 0.000 [self] None
| `- 0.000 dot <__array_function__ internals>:177
| `- 0.000 [self] None
|- 0.000 vecnorm scipy/optimize/_optimize.py:235
| |- 0.000 amax <__array_function__ internals>:177
| | `- 0.000 amax numpy/core/fromnumeric.py:2703
| | `- 0.000 _wrapreduction numpy/core/fromnumeric.py:69
| | `- 0.000 ufunc.reduce None
| |- 0.000 [self] None
| `- 0.000 sum <__array_function__ internals>:177
|- 0.000 [self] None
`- 0.000 _prepare_scalar_function scipy/optimize/_optimize.py:244
`- 0.000 ScalarFunction.__init__ scipy/optimize/_differentiable_functions.py:86
`- 0.000 ScalarFunction._update_fun scipy/optimize/_differentiable_functions.py:249
`- 0.000 update_fun scipy/optimize/_differentiable_functions.py:154
`- 0.000 fun_wrapped scipy/optimize/_differentiable_functions.py:132
`- 0.000 f <string>:2
from pyinstrument import Profiler

# Sampling profile of derivative-free Nelder-Mead on the same problem
# (grad/hess from mk_quad are built but deliberately unused here).
f, grad, hess = mk_quad(0.7)

profiler = Profiler(interval=0.00001)
profiler.start()
opt = optimize.minimize(fun=f, x0=(1.6, 1.1), method="Nelder-Mead", tol=1e-11)
p = profiler.stop()

profiler.print(show_all=True)
_ ._ __/__ _ _ _ _ _/_ Recorded: 10:56:16 Samples: 558
/_//_/// /_\ / //_// / //_'/ // Duration: 0.007 CPU time: 0.007
/ _/ v4.4.0
Program: /opt/homebrew/Cellar/python@3.10/3.10.10_1/libexec/bin/python3
0.007 MainThread <thread>:8553529664
|- 0.005 <module> <string>:1
| `- 0.005 minimize scipy/optimize/_minimize.py:45
| `- 0.005 _minimize_neldermead scipy/optimize/_optimize.py:708
| |- 0.002 [self] None
| |- 0.002 function_wrapper scipy/optimize/_optimize.py:564
| | |- 0.001 f <string>:2
| | | |- 0.000 sum <__array_function__ internals>:177
| | | | `- 0.000 sum numpy/core/fromnumeric.py:2188
| | | | |- 0.000 _wrapreduction numpy/core/fromnumeric.py:69
| | | | | |- 0.000 [self] None
| | | | | |- 0.000 ufunc.reduce None
| | | | | `- 0.000 <dictcomp> numpy/core/fromnumeric.py:70
| | | | `- 0.000 [self] None
| | | `- 0.000 [self] None
| | |- 0.000 copy <__array_function__ internals>:177
| | | |- 0.000 copy numpy/lib/function_base.py:871
| | | | `- 0.000 array None
| | | `- 0.000 [self] None
| | |- 0.000 [self] None
| | `- 0.000 isscalar numpy/core/numeric.py:1878
| | `- 0.000 [self] None
| |- 0.000 take <__array_function__ internals>:177
| | |- 0.000 take numpy/core/fromnumeric.py:93
| | | `- 0.000 _wrapfunc numpy/core/fromnumeric.py:51
| | | `- 0.000 ndarray.take None
| | `- 0.000 [self] None
| |- 0.000 ravel <__array_function__ internals>:177
| | |- 0.000 ravel numpy/core/fromnumeric.py:1781
| | | `- 0.000 [self] None
| | `- 0.000 [self] None
| |- 0.000 amax <__array_function__ internals>:177
| | `- 0.000 amax numpy/core/fromnumeric.py:2703
| | `- 0.000 _wrapreduction numpy/core/fromnumeric.py:69
| | |- 0.000 ufunc.reduce None
| | `- 0.000 [self] None
| `- 0.000 argsort <__array_function__ internals>:177
`- 0.002 [self] None
optimize.minimize() output message: Optimization terminated successfully.
success: True
status: 0
fun: 1.2739256453438974e-11
x: [-5.318e-07 -8.843e-06]
nit: 6
jac: [-3.510e-07 -2.860e-06]
hess_inv: [[ 1.515e+00 -3.438e-03]
[-3.438e-03 3.035e+00]]
nfev: 7
njev: 7
optimize.minimize() output (cont.) message: Optimization terminated successfully.
success: True
status: 0
fun: 2.3077013477040082e-10
x: [ 1.088e-05 3.443e-05]
nit: 46
nfev: 89
final_simplex: (array([[ 1.088e-05, 3.443e-05],
[ 1.882e-05, -3.825e-05],
[-3.966e-05, -3.147e-05]]), array([ 2.308e-10, 3.534e-10, 6.791e-10]))
/opt/homebrew/lib/python3.10/site-packages/scipy/optimize/_minimize.py:549: RuntimeWarning: Method Nelder-Mead does not use gradient information (jac).
warn('Method %s does not use gradient information (jac).' % method,
def run_collect(name, x0, cost_func, *args, tol=1e-8, skip=()):
    """Run every optimizer from define_methods() and collect its
    iteration / evaluation counts into one DataFrame.

    Parameters
    ----------
    name : str
        Problem label stored in the "name" column.
    x0 : tuple
        Starting point.
    cost_func : callable
        Factory returning the (f, grad, hess) triple.
    *args
        Forwarded to cost_func.
    tol : float
        Convergence tolerance passed to every method.
    skip : collection of str
        Method names to leave out.  BUG FIX: was a mutable default
        argument `skip=[]`; an immutable tuple avoids the shared
        mutable-default pitfall (membership tests are unchanged).

    Returns
    -------
    pandas.DataFrame with columns name, method, nit, nfev, njev, nhev, success.
    """
    f, grad, hess = cost_func(*args)
    methods = define_methods(x0, f, grad, hess, tol)
    res = []
    for method in methods:
        if method in skip:
            continue
        x = methods[method]()  # scipy OptimizeResult supports dict access
        res.append(pd.DataFrame({
            "name": name,
            "method": method,
            "nit": x["nit"],
            "nfev": x["nfev"],
            "njev": x.get("njev"),  # None for derivative-free methods
            "nhev": x.get("nhev"),
            "success": x["success"]
        }, index=[1]))
    return pd.concat(res)

df = pd.concat([
    run_collect(name, (1.6, 1.1), cost_func, arg, skip=['naive_newton', 'naive_cg'])
    for name, cost_func, arg in zip(
        ("Well-cond quad", "Ill-cond quad", "Rosenbrock"),
        (mk_quad, mk_quad, mk_rosenbrock),
        (0.7, 0.02, None)  # NOTE(review): mk_rosenbrock receives None — presumably ignored; confirm its signature
    )
])
df name method nit nfev njev nhev success
1 Well-cond quad CG 2 5 5 None True
1 Well-cond quad newton-cg 5 6 13 0 True
1 Well-cond quad newton-cg w/ H 15 15 15 15 True
1 Well-cond quad bfgs 8 9 9 None True
1 Well-cond quad bfgs w/o G 8 27 9 None True
1 Well-cond quad l-bfgs-b 6 21 7 None True
1 Well-cond quad nelder-mead 76 147 None None True
1 Ill-cond quad CG 9 17 17 None True
1 Ill-cond quad newton-cg 3 4 9 0 True
1 Ill-cond quad newton-cg w/ H 54 106 106 54 True
1 Ill-cond quad bfgs 5 11 11 None True
1 Ill-cond quad bfgs w/o G 5 33 11 None True
1 Ill-cond quad l-bfgs-b 5 30 10 None True
1 Ill-cond quad nelder-mead 102 198 None None True
1 Rosenbrock CG 17 52 48 None True
1 Rosenbrock newton-cg 18 22 60 0 True
1 Rosenbrock newton-cg w/ H 17 21 21 17 True
1 Rosenbrock bfgs 23 26 26 None True
1 Rosenbrock bfgs w/o G 23 78 26 None True
1 Rosenbrock l-bfgs-b 19 75 25 None True
1 Rosenbrock nelder-mead 96 183 None None True
Try minimizing the following function using different optimization methods starting from \(x_0 = [0,0]\), which method(s) appear to work best?
\[ \begin{align} f(x) = \exp(x_1-1) + \exp(-x_2+1) + (x_1-x_2)^2 \end{align} \]
For an \(n\)-dimensional multivariate normal we define
the \(n \times 1\) vectors \(x\) and \(\mu\) and the \(n \times n\)
covariance matrix \(\Sigma\),
\[ \begin{align} f(x) =& \det(2\pi\Sigma)^{-1/2} \\ & \exp \left[-\frac{1}{2} (x-\mu)^T \Sigma^{-1} (x-\mu) \right] \\ \\ \nabla f(x) =& -f(x) \Sigma^{-1}(x-\mu) \\ \\ \nabla^2 f(x) =& f(x) \left( \Sigma^{-1}(x-\mu)(x-\mu)^T\Sigma^{-1} - \Sigma^{-1}\right) \\ \end{align} \]
Our goal will be to find the mode (maximum) of this density.
def mk_mvn(mu, Sigma):
    """Build the *negative* n-dimensional multivariate normal density plus
    its gradient and Hessian (negated so that minimizing locates the mode).

    Parameters
    ----------
    mu : (n,) ndarray
        Mean vector.
    Sigma : (n, n) ndarray
        Covariance matrix; must be invertible.

    Returns
    -------
    (f, grad, hess) : callables of a single (n,) point x.
    """
    Sigma_inv = np.linalg.inv(Sigma)
    # 1 / sqrt(det(2*pi*Sigma)) — the MVN normalizing constant.
    norm_const = 1 / (np.sqrt(np.linalg.det(2 * np.pi * Sigma)))

    # Returns the negative density (since we want the max not min)
    def f(x):
        x_m = x - mu
        return -(norm_const *
                 np.exp(-0.5 * (x_m.T @ Sigma_inv @ x_m).item()))

    def grad(x):
        # Negative-density gradient: -f(x) * Sigma^-1 (x - mu)
        return (-f(x) * Sigma_inv @ (x - mu))

    def hess(x):
        n = len(x)
        x_m = x - mu
        # f(x) * ( Sigma^-1 (x-mu)(x-mu)^T Sigma^-1  -  Sigma^-1 )
        return f(x) * ((Sigma_inv @ x_m).reshape((n, 1)) @
                       (x_m.T @ Sigma_inv).reshape((1, n)) - Sigma_inv)

    return f, grad, hess

# One of the most common issues when implementing an optimizer is to get the
# gradient calculation wrong which can produce problematic results. It is
# possible to numerically check the gradient function by comparing results
# between the gradient function and finite differences from the objective
# function via optimize.check_grad().
2.634178031930877e-09
5.213238144735062e-10
2.8760747774580336e-12
2.850398669793798e-14
Note since the gradient of the gradient / jacobian is the hessian we can use this function to check our implementation of the hessian as well, just use grad() as func and hess() as grad.
name method nit nfev njev nhev success
1 5d CG 7 52 52 None True
1 5d newton-cg 3 7 10 0 True
1 5d bfgs 3 12 12 None True
1 5d l-bfgs-b 4 54 9 None True
1 5d nelder-mead 290 523 None None True
1 10d CG 2 24 24 None True
1 10d newton-cg 2 8 10 0 True
1 10d bfgs 3 19 19 None True
1 10d l-bfgs-b 3 110 10 None True
1 10d nelder-mead 1403 2000 None None False
1 20d CG 0 1 1 None True
1 20d newton-cg 1 1 1 0 True
1 20d bfgs 0 1 1 None True
1 20d l-bfgs-b 0 21 1 None True
1 20d nelder-mead 3217 4000 None None False
1 30d CG 0 1 1 None True
1 30d newton-cg 1 1 1 0 True
1 30d bfgs 0 1 1 None True
1 30d l-bfgs-b 0 31 1 None True
1 30d nelder-mead 5097 6000 None None False
def build_Sigma(n, corr=0.5):
    """Return an n x n covariance matrix with unit variances and constant
    pairwise correlation `corr` (an equicorrelation matrix)."""
    on_diagonal = np.eye(n, dtype=bool)
    return np.where(on_diagonal, 1.0, corr)
# Compare the optimizers on MVN mode-finding as the dimension grows.
df = pd.concat(
    run_collect(
        label, np.ones(dim), mk_mvn,
        np.zeros(dim), build_Sigma(dim),
        tol=1e-10,
        skip=['naive_newton', 'naive_cg', 'bfgs w/o G', 'newton-cg w/ H']
    )
    for label, dim in (("5d", 5), ("10d", 10), ("20d", 20), ("30d", 30))
)
# Output columns: name method nit nfev njev nhev success
1 5d CG 17 80 80 None True
1 5d newton-cg 4 6 10 0 True
1 5d bfgs 4 10 10 None True
1 5d l-bfgs-b 4 48 8 None True
1 5d nelder-mead 224 427 None None True
1 10d CG 4 31 31 None True
1 10d newton-cg 4 5 9 0 True
1 10d bfgs 2 12 12 None True
1 10d l-bfgs-b 3 77 7 None True
1 10d nelder-mead 1184 1802 None None True
1 20d CG 2 27 27 None True
1 20d newton-cg 1 2 3 0 True
1 20d bfgs 2 17 17 None True
1 20d l-bfgs-b 2 105 5 None True
1 20d nelder-mead 2745 4000 None None False
1 30d CG 1 18 18 None True
1 30d newton-cg 1 1 1 0 True
1 30d bfgs 1 18 18 None True
1 30d l-bfgs-b 1 93 3 None True
1 30d nelder-mead 4687 6000 None None False
message: Optimization terminated successfully.
success: True
status: 0
fun: -0.023337250777292103
x: [ 1.086e-07 1.061e-07 1.080e-07 1.080e-07 1.076e-07]
nit: 14
jac: [ 8.802e-10 7.646e-10 8.540e-10 8.532e-10 8.344e-10]
nfev: 67
njev: 67
message: Optimization terminated successfully.
success: True
status: 0
fun: -0.023337250777292328
x: [-2.232e-09 -3.779e-10 -1.811e-09 -1.797e-09 -1.496e-09]
nit: 17
jac: [-4.415e-11 4.237e-11 -2.453e-11 -2.387e-11 -9.827e-12]
nfev: 80
njev: 80
message: Desired error not necessarily achieved due to precision loss.
success: False
status: 2
fun: -0.02333725077729232
x: [ 2.205e-08 3.734e-09 1.790e-08 1.776e-08 1.478e-08]
nit: 16
jac: [ 4.362e-10 -4.186e-10 2.424e-10 2.359e-10 9.710e-11]
nfev: 93
njev: 81
message: Optimization terminated successfully.
success: True
status: 0
fun: -2.3301915597315495e-06
x: [-4.506e-05 -4.506e-05 ... -4.506e-05 -4.506e-05]
nit: 2884
jac: [-9.999e-12 -9.999e-12 ... -9.999e-12 -9.999e-12]
nfev: 66313
njev: 66313
Minimization of scalar function of one or more variables using the
BFGS algorithm.
Options
-------
disp : bool
Set to True to print convergence messages.
maxiter : int
Maximum number of iterations to perform.
gtol : float
Terminate successfully if gradient norm is less than `gtol`.
norm : float
Order of norm (Inf is max, -Inf is min).
eps : float or ndarray
If `jac is None` the absolute step size used for numerical
approximation of the jacobian via forward differences.
return_all : bool, optional
Set to True to return a list of the best solution at each of the
iterations.
finite_diff_rel_step : None or array_like, optional
If `jac in ['2-point', '3-point', 'cs']` the relative step size to
use for numerical approximation of the jacobian. The absolute step
size is computed as ``h = rel_step * sign(x) * max(1, abs(x))``,
possibly adjusted to fit into the bounds. For ``method='3-point'``
the sign of `h` is ignored. If None (default) then step is selected
automatically.
xrtol : float, default: 0
Relative tolerance for `x`. Terminate successfully if step size is
less than ``xk * xrtol`` where ``xk`` is the current parameter vector.
Minimization of scalar function of one or more variables using the
Nelder-Mead algorithm.
Options
-------
disp : bool
Set to True to print convergence messages.
maxiter, maxfev : int
Maximum allowed number of iterations and function evaluations.
Will default to ``N*200``, where ``N`` is the number of
variables, if neither `maxiter` or `maxfev` is set. If both
`maxiter` and `maxfev` are set, minimization will stop at the
first reached.
return_all : bool, optional
Set to True to return a list of the best solution at each of the
iterations.
initial_simplex : array_like of shape (N + 1, N)
Initial simplex. If given, overrides `x0`.
``initial_simplex[j,:]`` should contain the coordinates of
the jth vertex of the ``N+1`` vertices in the simplex, where
``N`` is the dimension.
xatol : float, optional
Absolute error in xopt between iterations that is acceptable for
convergence.
fatol : number, optional
Absolute error in func(xopt) between iterations that is acceptable for
convergence.
adaptive : bool, optional
Adapt algorithm parameters to dimensionality of problem. Useful for
high-dimensional minimization [1]_.
bounds : sequence or `Bounds`, optional
Bounds on variables. There are two ways to specify the bounds:
1. Instance of `Bounds` class.
2. Sequence of ``(min, max)`` pairs for each element in `x`. None
is used to specify no bound.
Note that this just clips all vertices in simplex based on
the bounds.
References
----------
.. [1] Gao, F. and Han, L.
Implementing the Nelder-Mead simplex algorithm with adaptive
parameters. 2012. Computational Optimization and Applications.
51:1, pp. 259-277
The following code comes from SciPy’s minimize() implementation:
if tol is not None:
options = dict(options)
if meth == 'nelder-mead':
options.setdefault('xatol', tol)
options.setdefault('fatol', tol)
if meth in ('newton-cg', 'powell', 'tnc'):
options.setdefault('xtol', tol)
if meth in ('powell', 'l-bfgs-b', 'tnc', 'slsqp'):
options.setdefault('ftol', tol)
if meth in ('bfgs', 'cg', 'l-bfgs-b', 'tnc', 'dogleg',
'trust-ncg', 'trust-exact', 'trust-krylov'):
options.setdefault('gtol', tol)
if meth in ('cobyla', '_custom'):
options.setdefault('tol', tol)
if meth == 'trust-constr':
options.setdefault('xtol', tol)
options.setdefault('gtol', tol)
options.setdefault('barrier_tol', tol)
Having access to the gradient is almost always helpful / necessary
Having access to the hessian can be helpful, but usually does not significantly improve things
The curse of dimensionality is real
tol - it means different things for different methods
In general, BFGS or L-BFGS should be a first choice for most problems (either well- or ill-conditioned)
array([-2.61, -4.69, -1.41, -3.59, -4.1 , -2.09, -2.13, -4. , -3.18,
-6. , -1.76, -1.96, -2.01, -5.73, -3.62, -3.2 , -2.69, -2.84,
-1.55, -5.13, -3.45, -4.02, -2.96, -2.51, -1.55, -3.79, -2.36,
-5.47, -3.43, -1.88, -3.7 , -2.78, -1.89, -1.89, -2.12, -3.35,
-3.04, -3.6 , -2.15, -0.21, -3.1 , -3.91, -3.15, -5.79, -2.89,
-4.32, -3.37, -3.18, -2.26, -2.93, -2.15, -5.01, -4.95, -3.33,
-3.89, -3.38, -2.76, -3.24, -2.49, -1.27, -4.42, -3.29, -2.82,
-3.46, -1.91, -6.2 , -0.66, -4.63, -2.94, -2.32, -4.18, -2.62,
-2.32, -2.55, -4.36, -0.69, -2.92, -4.64, -2.41, -3.15, -2.62,
-7.65, -1.55, -3.01, -2.99, -3.74, -2.24, -1.97, -2.86, -1.46,
-3.1 , -3.7 , -4.48, -3.93, -2.18, -3.3 , -3.63, -2.54, -4.54,
-3.84])
{'µ': -3.156109646093205, 'σ': 1.2446060629192535}
message: Optimization terminated successfully.
success: True
status: 0
fun: 163.77575977255518
x: [-3.156e+00 1.245e+00]
nit: 9
jac: [-1.907e-06 0.000e+00]
hess_inv: [[ 1.475e-02 -1.069e-04]
[-1.069e-04 7.734e-03]]
nfev: 47
njev: 15
/opt/homebrew/lib/python3.10/site-packages/scipy/optimize/_numdiff.py:576: RuntimeWarning: invalid value encountered in subtract
df = fun(x) - f0
It is also possible to specify bounds via bounds but this is only available for certain optimization methods.
message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
success: True
status: 0
fun: 163.7757597728758
x: [-3.156e+00 1.245e+00]
nit: 10
jac: [ 2.075e-04 0.000e+00]
nfev: 69
njev: 23
hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
Using optimize.minimize() recover the shape and scale parameters for these data using MLE.
from scipy.stats import gamma

# Draw 100 samples from a gamma(shape=2, scale=2) distribution with a
# fixed seed so the MLE exercise is reproducible.
g = gamma(a=2.0, scale=2.0)
x = g.rvs(size=100, random_state=1234)

x.round(2)
# array([ 4.7 , 1.11, ...
4.14, 2.84, 1.91, 8.03, 2.26, 2.88, 6.88, 6.84, 6.83,
6.1 , 3.03, 3.67, 2.57, 3.53, 2.07, 4.01, 1.51, 5.69,
3.92, 6.01, 0.82, 2.11, 2.97, 5.02, 9.13, 4.19, 2.82,
11.81, 1.17, 1.69, 4.67, 1.47, 11.67, 5.25, 3.44, 8.04,
3.74, 5.73, 6.58, 3.54, 2.4 , 1.32, 2.04, 2.52, 4.89,
4.14, 5.02, 4.75, 8.24, 7.6 , 1. , 6.14, 0.58, 2.83,
2.88, 5.42, 0.5 , 3.46, 4.46, 1.86, 4.59, 2.24, 2.62,
3.99, 3.74, 5.27, 1.42, 0.56, 7.54, 5.5 , 1.58, 5.49,
6.57, 4.79, 5.84, 8.21, 1.66, 1.53, 4.27, 2.57, 1.48,
5.23, 3.84, 3.15, 2.1 , 3.71, 2.79, 0.86, 8.52, 4.36,
3.3 ])
Sta 663 - Spring 2023